Imports¶

In [1]:
# Third-Party Imports
import torch
import openai
from sentence_transformers import SentenceTransformer, util
from evaluate import load
from datasets import load_dataset

import numpy as np

import nltk
from nltk.stem import WordNetLemmatizer
import spacy

import requests as req
import wikipediaapi
from bs4 import BeautifulSoup

from dotenv import load_dotenv
load_dotenv()

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as pyo
pyo.init_notebook_mode()

# Standard Imports
import os
import sys
import json
from string import punctuation
from math import log1p, inf

Utility Functions¶

Plotting¶

In [2]:
# Plotting functions
def plot_bar_data(*bars, x=None, title="", x_label="", y_label=""):
    fig = go.Figure(
        layout={
            "title": title,
            "xaxis": {"title": x_label},
            "yaxis": {"title": y_label},
            "barmode": "group"
        }, data=[
            go.Bar(name=f"{bar[0]}", x=x, y=bar[1])
            for bar in bars
        ])
    
    return fig

def create_bar(name, data):
    return (name, data)

Text Processing¶

In [3]:
# Tokenization
def tokenize(doc, remove_stopwords=True):
    banned = list(punctuation)
    
    if remove_stopwords:
        banned += nltk.corpus.stopwords.words("english")
    
    return [
        w.lower() for w in nltk.word_tokenize(doc)
        if w.lower() not in banned
    ]

DocSearcher Class¶

In [4]:
# DocSearcher Class
# Implementation of all NLP methods used by LAME for info extraction (incl. the WikiBot) in a single class
class DocSearcher():
    def __init__(self):
        self._corpus = dict()
        self._file_matches = 2
        self._sentence_matches = 1
        self._sent_transformer = SentenceTransformer(
            "sentence-transformers/all-MiniLM-L6-v2"
        )
    
    def view_corpus(self):
        return self._corpus

    def load_files(self, corpus):
        self._corpus = corpus
    
    def clear_files(self):
        self._corpus = dict()

    def search(self, query, s_method='tf-idf'):
        fnames = self._corpus.keys()

        if s_method == 'tf-idf':
            joint_context, ranked_sents = self._context_and_sents_idf(query, fnames)
            output_text = self._build_output_text(ranked_sents, inf)
            answer = ' '.join(nltk.sent_tokenize(output_text)[:self._sentence_matches])
            return answer
        
        joint_context, ranked_sents = self._context_and_sents_cosine(query, fnames)
                
        if s_method == "cosine_sim":
            output_text = self._build_output_text(ranked_sents, inf)
            answer = ' '.join(nltk.sent_tokenize(output_text)[:self._sentence_matches])
        elif s_method == "bert":
            output_text = self._build_output_text(ranked_sents, 2048)
            answer = self._run_model_bert(query, output_text)
        elif s_method == "openai":
            output_text = self._build_output_text(ranked_sents, 2500)
            answer = self._run_model_openai(query, output_text)
        
        return answer.strip()
    
    def _build_output_text(self, ranked_sents, max_length=512):
        output_text = ''

        for sent in ranked_sents:
            new_sent = sent[0]
            if len(nltk.word_tokenize(f'{output_text} {new_sent}')) <= max_length:
                output_text += f' {new_sent}'
            else:
                break

        return output_text
    
    def _run_model_bert(self, query, context):
        # Get API url and headers
        api_url = "https://api-inference.huggingface.co/models/bert-large-uncased-whole-word-masking-finetuned-squad"
        headers = {
            "Authorization": f"Bearer {os.getenv('HUGGING_FACE_API_KEY')}"
        }

        payload = {
            "inputs": {
                "question": query,
                "context": context
            }
        }

        data = json.dumps(payload)
        res = req.request("POST", api_url, headers=headers, data=data)

        content = json.loads(res.content.decode("utf-8"))
        answer = content.get("answer", None)
        if not answer:
            return f"Error:  {content.get('error', 'Something is wrong')}"
        return answer
        
    def _run_model_openai(self, query, text):
        openai.api_key = os.getenv("OPENAI_API_KEY")

        res = openai.Completion.create(
            model="text-davinci-003", 
            prompt=f"Context: {query} Query: {text}\n\nUsing only the context given, answer the query.", 
            temperature=0,
            max_tokens=500,
        )
        
        return res.choices[0].text

    def _context_and_sents_idf(self, query, fnames):
        idfs = self._compute_idfs(fnames)
        top_files = self._top_files_idf(query, idfs)

        joint_context = "\n".join(self._corpus[name] for name in top_files)

        ranked_sents = self._sent_rank_idf(query, joint_context, idfs)

        return joint_context, ranked_sents
    
    def _context_and_sents_cosine(self, query, fnames):
        top_files = self._top_files_cosine(query, fnames)
        joint_context = "\n".join(self._corpus[name] for name in top_files)

        ranked_sents = self._sent_rank_cosine(query, joint_context)

        return joint_context, ranked_sents

    def _cosine_similarity(self, text_1, text_2, model):
        embedding_1 = model.encode(text_1, convert_to_tensor=True)
        embedding_2 = model.encode(text_2, convert_to_tensor=True)
    
        return float(util.pytorch_cos_sim(embedding_1, embedding_2))
    
    def _compute_idfs(self, fnames):
        file_idfs = dict()
        unique_words = set()
        num_docs = len(fnames)

        for name in fnames:
            for sent in nltk.sent_tokenize(self._corpus[name]):
                unique_words = set().union(
                    unique_words, 
                    set(self._word_tokenize(sent))
                )
                
        for word in unique_words:
            num_apps = sum(1 for name in fnames if word in self._corpus[name])
            if num_apps > 0:
                file_idfs[word] = log1p(num_docs / num_apps)
        
        return file_idfs

    def _top_files_idf(self, query, idfs):
        tf_idfs = { fname: 0 for fname in self._corpus }

        query = self._word_tokenize(query)

        for w in query:
            for fname in self._corpus:
                tf_idfs[fname] += self._corpus[fname].count(w) * idfs.get(w, 0)
        
        ranked_files = sorted(
            tf_idfs.items(),
            key=lambda x: x[1],
            reverse=True
        )

        return [file[0] for file in ranked_files][:self._file_matches]
    
    def _top_files_cosine(self, query, fnames):
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

        ranked_files = sorted([
            (name, self._cosine_similarity(query, self._corpus[name], model))
            for name in fnames
        ], key=lambda x: x[1], reverse=True)

        return [file[0] for file in ranked_files][:self._file_matches]
    
    def _word_tokenize(self, words):
        banned = list(punctuation) + nltk.corpus.stopwords.words("english")

        return [
            w.lower() for w in nltk.word_tokenize(words)
            if w.lower() not in banned
        ]
    
    def _sent_rank_idf(self, query, context, idfs):
        query_set = set(self._word_tokenize(query))
        sent_scores = { sent: [0,0] for sent in nltk.sent_tokenize(context)}

        for sent in sent_scores:
            sent_set = set(self._word_tokenize(sent))
            common_words = query_set.intersection(sent_set)
            sent_scores[sent][0] += sum(idfs.get(w, 0) for w in common_words)
            sent_scores[sent][1] += len(common_words)
        
        ranked_sents = sorted(
            sent_scores.items(),
            key=lambda x: (x[1][0], x[1][1]),
            reverse=True
        )

        return [(sent, score[0]) for sent, score in ranked_sents]

    def _sent_rank_cosine(self, query, context):
        model = self._sent_transformer
        sent_scores = {
            sent: self._cosine_similarity(query, sent, model)
            for sent in nltk.sent_tokenize(context)
        }
    
        ranked_sents = sorted(
            sent_scores.items(),
            key = lambda x: x[1],
            reverse=True
        )
    
        return ranked_sents

Loading Data¶

In [5]:
def load_squad_data(subset_size=5):
    indices = np.random.randint(0, 10570, (subset_size,))
    squad = load_dataset(
        "squad", 
        split="validation", 
    ).select(indices)

    return squad

Method Evaluation¶

Helper Functions¶

In [6]:
def predict_sample(squad_sample, method):
    """
    Run an info extraction method on a single example from
    the SQuAD dataset.
    """
    # Get relevant properties from squad sample
    question = squad_sample["question"]
    title = squad_sample["title"]
    context = squad_sample["context"]
    sample_id = squad_sample["id"]
    
    # Initialise doc searcher
    doc_searcher = DocSearcher()
    
    # Build and load corpus for doc searcher
    doc_searcher.load_files({title: context})
    
    # Get predicted text
    pred_text = doc_searcher.search(question, method)
    doc_searcher.clear_files()
    
    # Build prediction object
    pred_obj = {"prediction_text": pred_text, "id": sample_id}
    
    # Build reference object
    ref_obj = {"answers": squad_sample["answers"]}
    ref_obj["id"] = sample_id
    
    return pred_obj, ref_obj

def predict_samples(squad_ds, method):
    """
    Run an info extraction method on multiple examples from
    the SQuAD dataset.
    """
    # Initialse lists for storing prediction and reference objects
    predictions = []
    references = []
    
    # Run method on all samples in dataset
    for sample in squad_ds:
        pred_obj, ref_obj = predict_sample(sample, method)
        predictions.append(pred_obj)
        references.append(ref_obj)
    
    return predictions, references

def evaluate_method(squad_ds, method, squad_metric):
    """
    Get the average exact match and F1 scores of an info
    extraction method after running it on a subset of SQuAD
    """
    # Get prediction and reference objects
    preds, refs = predict_samples(squad_ds, method)
    
    # Get results
    results = squad_metric.compute(predictions=preds, references=refs)
    return results

def visualise_results(results):
    """
    Take results from the evaluate_method function
    an create bar graphs to visualise them.
    """
    method_labels = {
        "tf-idf": "TF-IDF",
        "bert": "BERT",
        "openai": "OpenAI",
        "cosine_sim": "Cosine Similarity"
    }
    
    plots = dict()
    
    # Create plot for average scores
    x = [method_labels[r["method"]] for r in results]
    em_bar = create_bar("Average Exact Match Score", [r["avg_em"] for r in results])
    f1_bar = create_bar("Average F1 Score", [r["avg_f1"] for r in results])
    
    avg_score_plot = plot_bar_data(em_bar, f1_bar, x=x, title="Average Scores")
    plots["average_score_plot"] = avg_score_plot
    
    # Create plot for EM and F1 scores over multiple trials
    for r in results:
        x = [f"Sample #{i+1}" for i in range(len(r["f1_scores"]))]
        em_bar = create_bar("Exact Match Score", r["em_scores"])
        f1_bar = create_bar("F1 Score", r["f1_scores"])
        new_plot = plot_bar_data(
            em_bar, 
            f1_bar, 
            x=x, 
            title=f"Exact Match and F1 Scores for {method_labels[r['method']]}"
        )
        plots[f"{r['method']}_plot"] = new_plot
    
    return plots

Method Evaluator¶

In [7]:
def method_evaluator(methods, num_trials=10, dataset_size=10):
    """
    Evaluate several info extraction methods at once.
    """
    # Initialise results object
    results = [
        {
            "f1_scores": [],
            "em_scores": [],
            "method": m
        }
        for m in methods
    ]
    
    # Load squad evaluator
    squad_metric = load("squad")
    
    for t in range(num_trials):
        print(f"Trial #{t+1}")
        squad_ds = load_squad_data(dataset_size)
        for i, m in enumerate(methods):
            result = evaluate_method(squad_ds, m, squad_metric)
            results[i]["f1_scores"].append(result.get("f1", None))
            results[i]["em_scores"].append(result.get("exact_match", None))
    
    for i, _ in enumerate(results):
        results[i]["avg_f1"] = np.mean(results[i]["f1_scores"])
        results[i]["avg_em"] = np.mean(results[i]["em_scores"])
    
    return results

Evaluation¶

In [8]:
# Get results of evaluation of each info extraction method
results = method_evaluator(["tf-idf", "cosine_sim", "bert", "openai"], 10, 10)
results
Trial #1
Found cached dataset squad (/Users/bhekimaenetja/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Trial #2
Found cached dataset squad (/Users/bhekimaenetja/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Trial #3
Found cached dataset squad (/Users/bhekimaenetja/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Trial #4
Found cached dataset squad (/Users/bhekimaenetja/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Trial #5
Found cached dataset squad (/Users/bhekimaenetja/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Trial #6
Found cached dataset squad (/Users/bhekimaenetja/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Trial #7
Found cached dataset squad (/Users/bhekimaenetja/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Trial #8
Found cached dataset squad (/Users/bhekimaenetja/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Trial #9
Found cached dataset squad (/Users/bhekimaenetja/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Trial #10
Found cached dataset squad (/Users/bhekimaenetja/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Out[8]:
[{'f1_scores': [13.35003685003685,
   11.878500861053205,
   22.95218816271448,
   21.09002569265053,
   24.03466794643265,
   15.938863698454492,
   15.533238204679293,
   18.12565464945294,
   17.194956894956896,
   16.67998429426143],
  'em_scores': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  'method': 'tf-idf',
  'avg_f1': 17.677811725469276,
  'avg_em': 0.0},
 {'f1_scores': [16.079617211196158,
   10.902597402597403,
   24.296992481203006,
   21.09002569265053,
   20.390145801910506,
   14.60553036512116,
   18.89377035545534,
   24.038210562008853,
   21.273499405078354,
   16.67998429426143],
  'em_scores': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  'method': 'cosine_sim',
  'avg_f1': 18.825037357148275,
  'avg_em': 0.0},
 {'f1_scores': [10.0,
   100.0,
   80.0,
   95.36842105263159,
   88.57142857142858,
   86.66666666666666,
   76.66666666666666,
   84.0,
   20.0,
   95.95238095238095],
  'em_scores': [10.0, 100.0, 80.0, 80.0, 80.0, 80.0, 70.0, 80.0, 20.0, 80.0],
  'method': 'bert',
  'avg_f1': 73.72255639097745,
  'avg_em': 68.0},
 {'f1_scores': [37.91301831235688,
   25.80210221876888,
   31.531937087019845,
   38.18693773623832,
   20.827731767367304,
   31.934221806562228,
   15.13217071466713,
   26.437768582884665,
   42.5697309226721,
   25.009489885629865],
  'em_scores': [10.0, 10.0, 0.0, 0.0, 0.0, 10.0, 0.0, 10.0, 10.0, 0.0],
  'method': 'openai',
  'avg_f1': 29.53451090341672,
  'avg_em': 5.0}]
In [9]:
# Get data visualisations of results
results_plots = visualise_results(results)
In [10]:
# Results for TF-IDF
results_plots["tf-idf_plot"]
In [11]:
# Results for cosine similarity
results_plots["cosine_sim_plot"]
In [12]:
# Results for BERT
results_plots["bert_plot"]
In [13]:
# Results for OpenAI
results_plots["openai_plot"]
In [14]:
# Average F1 and exact match scores for all methods
results_plots["average_score_plot"]
In [ ]: